# 加载randomForest包
#if (!require(randomForest)) install.packages("randomForest")
library(randomForest)
library(caret)
library(pROC)
library(MatchIt)
library(dplyr)

# 设置工作目录并加载数据
setwd("${path}")



mydata <- read.csv("bibliometric_nuomotu.csv")


cols<-colnames(mydata)[3:38]
colnames(mydata)
cols
colsdou<- paste0("'",cols,"'")
independent_and<- paste0(cols,collapse = "+")
independent_dou<- paste0(colsdou,collapse = ",")
# 确保 group_best 是因子，并且值为 "event_0" 和 "event_1"
mydata$group_best <- factor(mydata$group_best, levels = c("0", "1"), labels = c("event_0", "event_1"))


# 使用 matchit 进行 1:3 匹配
m.out <- matchit(as.formula(paste0("group_best~",independent_and)), 
                 data = mydata, 
                 method = "nearest", 
                 ratio = 3)

# 提取匹配后的数据
matched_data <- match.data(m.out)
# 训练随机森林模型
model_rf <- randomForest(
  as.formula(paste0("group_best~",independent_and)), 
  data = matched_data,
  ntree = 500,
  mtry = 3,
  importance = TRUE
)

# 预测和评估
predictions_rf <- predict(model_rf, newdata = matched_data, type = "prob")[,2]
predicted_classes_rf <- factor(ifelse(predictions_rf >= 0.5, "event_1", "event_0"), levels = c("event_0", "event_1"))

# 添加预测结果到数据集
mydata_with_predictions_rf <- matched_data %>%
  mutate(
    group_best_probability = predictions_rf,
    group_best_predicted_class = predicted_classes_rf,
    group_best_score = NA  # RF没有评分卡
  )

# 计算 group_best 和 group_best_predicted_class 相同的数据数量
matching_rows <- mydata_with_predictions_rf %>%
  filter(group_best == group_best_predicted_class) %>%
  nrow()
# 打印结果
cat("Number of rows where group_best and group_best_predicted_class match:", matching_rows, "\n")
# 可选：计算准确率（匹配行数 / 总行数）
total_rows <- nrow(matched_data)
accuracy <- matching_rows / total_rows
# 打印结果
cat("Number of rows where group_best and group_best_predicted_class match:", accuracy, "\n")
# 这是建模集的结果
write.csv(mydata_with_predictions_rf, file = "mydata_with_predictions.csv", row.names = FALSE)

try({
#### 画图开始 
#### 特征重要性图 ####
# 提取特征重要性
importance_matrix <- importance(model_rf)

# 创建一个数据框，包含变量名称和它们的重要性得分
importance_df <- data.frame(
  Variable = rownames(importance_matrix),
  Importance = importance_matrix[, "MeanDecreaseGini"],
  stringsAsFactors = FALSE
)

# 按照重要性降序排序
importance_df <- importance_df[order(-importance_df$Importance), ]

# 绘制特征重要性图
library(ggplot2)
png(file="importanceFeatures_rf.png", width=800, height=800)
ggplot(importance_df, aes(x=reorder(Variable, Importance), y=Importance)) +
  geom_bar(stat="identity", fill="steelblue") +
  coord_flip() +
  theme_minimal() +
  labs(title="Variable Importance Plot",
       x="Variables",
       y="Importance Score") +
  theme(axis.text.y = element_text(size = 8)) # 如果有很多变量，可以调整字体大小以适应图表
dev.off()
#### 部分依赖图 ####
library(pdp)
# 获取特征名称列表
feature_names <- c(independent_dou)
# 为每个特征创建单变量部分依赖图
# for (feature in feature_names) {
#   # 创建PDP并绘图
#   pd <- partial(model_rf, pred.var = feature, plot = TRUE, train = matched_data, rug = TRUE)
#   png(file=paste0("partialDependencyGraph_",feature,"_rf.png"), width=800, height=800)
#   pd
#   dev.off()
# }
#### 混淆矩阵 ####
confusionMatrix(data = predicted_classes_rf, reference = matched_data$group_best)
sink("confusionMatrix_rf.txt")
# print(conf_matrix)
sink()
#### ROC曲线 ####
# ROC曲线
png(file="roc_rf.png", width=800, height=800)
roc_curve <- roc(matched_data$group_best, predictions_rf, plot=TRUE, print.auc=TRUE, col="blue", main="ROC Curve")
dev.off()
try({
  dev.off()
})


####画图结束
})

#### 森林图 不能在try 里面画图 画不出来 ####
library(forestplot)
png(file="forestplot_rf.png", width=800, height=800)
forestplot(labeltext=cbind(rownames(importance_df), round(importance_df$Importance, 2)), 
           mean=importance_df$Importance,
           lower=importance_df$Importance - sd(importance_df$Importance),
           upper=importance_df$Importance + sd(importance_df$Importance),
           main="Forest Plot of Variable Importance",
           zero=0,
           point.pch = 0,                                       # 空心圆点
           point.cex = 0.5,                                      # 点的大小
           col=fpColors(box="black", lines="black", text="black"))
dev.off()
try({
  dev.off()
})


mydata1 <- read.csv("bibliometric_nuomotuRes.csv")
# 预测和评估
predictions_rf <- predict(model_rf, newdata = mydata1, type = "prob")[,2]
predicted_classes_rf <- factor(ifelse(predictions_rf >= 0.5, "event_1", "event_0"), levels = c("event_0", "event_1"))

# 添加预测结果到数据集
result_with_predictions_rf <- mydata1 %>%
  mutate(
    group_best_probability = predictions_rf,
    group_best_predicted_class = predicted_classes_rf,
    group_best_score = NA  # RF没有评分卡
  )
# 这是结局的结果
write.csv(result_with_predictions_rf, file = "result_with_predictions.csv", row.names = FALSE)

#结束
